import gc

import numpy as np
import pandas as pd
import scipy
import sklearn
import sklearn.decomposition

import functions
# Load the atlas colour table and index it by its 'Sample Source' column.
colour_table = pd.read_csv(
    '/Users/pwangel/Data/Metadata_dumps/imac_atlas_colours.tsv',
    sep='\t',
)
colour_table = colour_table.set_index('Sample Source')

# Collapse the table into {sample source: value of the first remaining column}.
# This dict is later handed to functions.plot_pca as `colour_dict`.
blood_atlas_colours = {
    source: row[0]
    for source, row in zip(colour_table.index.values, colour_table.values)
}
# Reading in data, including Nadia's annotations, an Excel spreadsheet with multiple tabs.
# Sample annotations and per-gene metadata; both are tab-separated with the
# first column used as the index.
annotations = pd.read_csv(
    '/Users/pwangel/PlotlyWorkspace/combine_data/blood/outputs_for_front_end/iMac_annotations.tsv',
    sep='\t',
    index_col=0,
)
genes = pd.read_csv(
    '/Users/pwangel/Downloads/myeloid_atlas_genes.tsv',
    sep='\t',
    index_col=0,
)

# Expression table, passed straight through the project's percentile transform
# so that everything downstream works on the transformed values.
data = functions.transform_to_percentile(
    pd.read_csv(
        '/Users/pwangel/Downloads/myeloid_atlas_expression_v7.1.tsv',
        sep='\t',
        index_col=0,
    )
)
# Only need to compute the gene variance fraction if it has not been done already.
#genes = functions.calculate_platform_dependence(data, annotations)
#genes.to_csv('/Users/pwangel/Downloads/myeloid_atlas_genes.tsv', sep='\t')
# Select the rows of `data` whose Platform_VarFraction in `genes` is at most 0.2,
# re-apply the project's percentile transform to that subset, and transpose it
# before handing it to scikit-learn (presumably `data` is genes x samples, so the
# transpose puts samples on rows -- confirm against the input file).
#
# The matrix is built once and reused for both fit and transform; it was
# previously recomputed for each call. This assumes
# functions.transform_to_percentile is deterministic and does not depend on
# call count.
low_platform_genes = genes.Platform_VarFraction.values <= 0.2
pca_input = functions.transform_to_percentile(data.loc[low_platform_genes]).transpose()

# 10-component PCA using the exact (full) SVD solver.
pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')
pca.fit(pca_input)
pca_coords = pca.transform(pca_input)
# Plot the PCA projection via the project helper. `labels` lists the annotation
# columns passed through, and `colour_dict` supplies the sample-source colours
# (presumably one colouring per label -- confirm in functions.plot_pca).
functions.plot_pca(
    pca_coords,
    annotations,
    pca,
    labels=['celltype', 'Platform_Category', 'Dataset'],
    colour_dict=blood_atlas_colours,
)

# Platform-dependence diagnostics on the full (unfiltered) expression table.
functions.plot_gene_platform_dependence_distribution(data, annotations, genes)
functions.plot_KW_Htest(data, annotations, genes)
Assessing platform dependence for principal components with varying threshold. Analysing threshold of 0.020000 (71 genes) Analysing threshold of 0.040000 (279 genes) Analysing threshold of 0.060000 (533 genes) Analysing threshold of 0.080000 (878 genes) Analysing threshold of 0.100000 (1253 genes) Analysing threshold of 0.120000 (1690 genes) Analysing threshold of 0.140000 (2179 genes) Analysing threshold of 0.160000 (2648 genes)